1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.solr.internal.csv;
18  
19  import java.io.IOException;
20  import java.io.Reader;
21  import java.io.InputStreamReader;
22  import java.io.InputStream;
23  import java.util.ArrayList;
24  
25  
26  /**
27   * Parses CSV files according to the specified configuration.
28   *
29   * Because CSV appears in many different dialects, the parser supports many
30   * configuration settings by allowing the specification of a {@link CSVStrategy}.
31   * 
32   * <p>Parsing of a csv-string having tabs as separators,
33   * '"' as an optional value encapsulator, and comments starting with '#':</p>
34   * <pre>
35   *  String[][] data = 
36   *   (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
37   * </pre>
38   * 
39   * <p>Parsing of a csv-string in Excel CSV format</p>
40   * <pre>
41   *  String[][] data =
42   *   (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
43   * </pre>
44   * 
45   * <p>
46   * Internal parser state is completely covered by the strategy
47   * and the reader-state.</p>
48   * 
49   * <p>see <a href="package-summary.html">package documentation</a> 
50   * for more details</p>
51   */
52  public class CSVParser {
53  
54    /** length of the initial token (content-)buffer */
55    private static final int INITIAL_TOKEN_LENGTH = 50;
56    
57    // the token types
58    /** Token has no valid content, i.e. is in its initialized state. */
59    protected static final int TT_INVALID = -1;
60    /** Token with content, at beginning or in the middle of a line. */
61    protected static final int TT_TOKEN = 0;
62    /** Token (which can have content) when end of file is reached. */
63    protected static final int TT_EOF = 1;
64    /** Token with content when end of a line is reached. */
65    protected static final int TT_EORECORD = 2;
66  
67    /** Immutable empty String array. */
68    private static final String[] EMPTY_STRING_ARRAY = new String[0];
69     
70    // the input stream
71    private final ExtendedBufferedReader in;
72  
73    private final CSVStrategy strategy;
74    
75    // the following objects are shared to reduce garbage 
76    /** A record buffer for getLine(). Grows as necessary and is reused. */
77    private final ArrayList record = new ArrayList();
78    private final Token reusableToken = new Token();
79    private final CharBuffer wsBuf = new CharBuffer();
80    private final CharBuffer code = new CharBuffer(4);
81  
82    
83    /**
84     * Token is an internal token representation.
85     * 
86     * It is used as contract between the lexer and the parser. 
87     */
88    static class Token {
89      /** Token type, see TT_xxx constants. */
90      int type = TT_INVALID;
91      /** The content buffer. */
92      CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
93      /** Token ready flag: indicates a valid token with content (ready for the parser). */
94      boolean isReady;
95      
96      Token reset() {
97          content.clear();
98          type = TT_INVALID;
99          isReady = false;
100         return this;
101     }
102   }
103   
104   // ======================================================
105   //  the constructor
106   // ======================================================
107   
108   /**
109    * CSV parser using the default {@link CSVStrategy}.
110    * 
111    * @param input a Reader containing "csv-formatted" input
112    */
113   public CSVParser(Reader input) {
114     // note: must match default-CSV-strategy !!
115     this(input, ',');
116   }
117   
118   /**
119    * Customized value delimiter parser.
120    * 
121    * The parser follows the default {@link CSVStrategy}
122    * except for the delimiter setting.
123    * 
124    * @param input a Reader based on "csv-formatted" input
125    * @param delimiter a Char used for value separation
126    * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
127    */
128   public CSVParser(Reader input, char delimiter) {
129     this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
130   }
131   
132   /**
133    * Customized csv parser.
134    * 
135    * The parser parses according to the given CSV dialect settings.
136    * Leading whitespaces are truncated, unicode escapes are
137    * not interpreted and empty lines are ignored.
138    * 
139    * @param input a Reader based on "csv-formatted" input
140    * @param delimiter a Char used for value separation
141    * @param encapsulator a Char used as value encapsulation marker
142    * @param commentStart a Char used for comment identification
143    * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
144    */
145   public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
146     this(input, new CSVStrategy(delimiter, encapsulator, commentStart));
147   }
148 
149   /**
150    * Customized CSV parser using the given {@link CSVStrategy}
151    *
152    * @param input a Reader containing "csv-formatted" input
153    * @param strategy the CSVStrategy used for CSV parsing
154    */
155   public CSVParser(Reader input, CSVStrategy strategy) {
156     this.in = new ExtendedBufferedReader(input);
157     this.strategy = strategy;
158   }
159   
160   // ======================================================
161   //  the parser
162   // ======================================================
163   
164   /**
165    * Parses the CSV according to the given strategy
166    * and returns the content as an array of records
167    * (whereas records are arrays of single values).
168    * <p>
169    * The returned content starts at the current parse-position in
170    * the stream.
171    * 
172    * @return matrix of records x values ('null' when end of file)
173    * @throws IOException on parse error or input read-failure
174    */
175   public String[][] getAllValues() throws IOException {
176     ArrayList records = new ArrayList();
177     String[] values;
178     String[][] ret = null;
179     while ((values = getLine()) != null)  {
180       records.add(values);
181     }
182     if (records.size() > 0) {
183       ret = new String[records.size()][];
184       records.toArray(ret);
185     }
186     return ret;
187   }
188   
189   /**
190    * Parses the CSV according to the given strategy
191    * and returns the next csv-value as string.
192    * 
193    * @return next value in the input stream ('null' when end of file)
194    * @throws IOException on parse error or input read-failure
195    */
196   public String nextValue() throws IOException {
197     Token tkn = nextToken();
198     String ret = null;
199     switch (tkn.type) {
200       case TT_TOKEN:
201       case TT_EORECORD: 
202         ret = tkn.content.toString();
203         break;
204       case TT_EOF:
205         ret = null;
206         break;
207       case TT_INVALID:
208       default:
209         // error no token available (or error)
210         throw new IOException(
211           "(line " + getLineNumber() 
212           + ") invalid parse sequence");
213         // unreachable: break;
214     }
215     return ret;
216   }
217   
218   /**
219    * Parses from the current point in the stream til
220    * the end of the current line.
221    * 
222    * @return array of values til end of line 
223    *        ('null' when end of file has been reached)
224    * @throws IOException on parse error or input read-failure
225    */
226   public String[] getLine() throws IOException {
227     String[] ret = EMPTY_STRING_ARRAY;
228     record.clear();
229     while (true) {
230         reusableToken.reset();
231         nextToken(reusableToken);
232         switch (reusableToken.type) {
233             case TT_TOKEN:
234                 record.add(reusableToken.content.toString());
235                 break;
236             case TT_EORECORD:
237                 record.add(reusableToken.content.toString());
238                 break;
239             case TT_EOF:
240                 if (reusableToken.isReady) {
241                     record.add(reusableToken.content.toString());
242                 } else {
243                     ret = null;
244                 }
245                 break;
246             case TT_INVALID:
247             default:
248                 // error: throw IOException
249                 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
250             // unreachable: break;
251         }
252         if (reusableToken.type != TT_TOKEN) {
253             break;
254         }
255     }
256     if (!record.isEmpty()) {
257       ret = (String[]) record.toArray(new String[record.size()]);
258     }
259     return ret;
260   }
261   
262   /**
263    * Returns the current line number in the input stream.
264    * 
265    * ATTENTION: in case your csv has multiline-values the returned
266    *            number does not correspond to the record-number
267    * 
268    * @return  current line number
269    */
270   public int getLineNumber() {
271     return in.getLineNumber();  
272   }
273   
274   // ======================================================
275   //  the lexer(s)
276   // ======================================================
277  
278   /**
279    * Convenience method for <code>nextToken(null)</code>.
280    */
281   protected Token nextToken() throws IOException {
282       return nextToken(new Token());
283   }
284   
285  /**
286    * Returns the next token.
287    * 
288    * A token corresponds to a term, a record change or an
289    * end-of-file indicator.
290    * 
291    * @param tkn an existing Token object to reuse. The caller is responsible to initialize the
292    * Token.
293    * @return the next token found
294    * @throws IOException on stream access error
295    */
296   protected Token nextToken(Token tkn) throws IOException {
297     wsBuf.clear(); // resuse
298     
299     // get the last read char (required for empty line detection)
300     int lastChar = in.readAgain();
301     
302     //  read the next char and set eol
303     /* note: unfourtunately isEndOfLine may consumes a character silently.
304      *       this has no effect outside of the method. so a simple workaround
305      *       is to call 'readAgain' on the stream...
306      *       uh: might using objects instead of base-types (jdk1.5 autoboxing!)
307      */
308     int c = in.read();
309     boolean eol = isEndOfLine(c);
310     c = in.readAgain();
311      
312     //  empty line detection: eol AND (last char was EOL or beginning)
313     while (strategy.getIgnoreEmptyLines() && eol 
314       && (lastChar == '\n' 
315       || lastChar == ExtendedBufferedReader.UNDEFINED) 
316       && !isEndOfFile(lastChar)) {
317       // go on char ahead ...
318       lastChar = c;
319       c = in.read();
320       eol = isEndOfLine(c);
321       c = in.readAgain();
322       // reached end of file without any content (empty line at the end)
323       if (isEndOfFile(c)) {
324         tkn.type = TT_EOF;
325         return tkn;
326       }
327     }
328 
329     // did we reached eof during the last iteration already ? TT_EOF
330     if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
331       tkn.type = TT_EOF;
332       return tkn;
333     } 
334     
335     //  important: make sure a new char gets consumed in each iteration
336     while (!tkn.isReady && tkn.type != TT_EOF) {
337       // ignore whitespaces at beginning of a token
338       while (strategy.getIgnoreLeadingWhitespaces() && isWhitespace(c) && !eol) {
339         wsBuf.append((char) c);
340         c = in.read();
341         eol = isEndOfLine(c);
342       }
343       // ok, start of token reached: comment, encapsulated, or token
344       if (c == strategy.getCommentStart()) {
345         // ignore everything till end of line and continue (incr linecount)
346         in.readLine();
347         tkn = nextToken(tkn.reset());
348       } else if (c == strategy.getDelimiter()) {
349         // empty token return TT_TOKEN("")
350         tkn.type = TT_TOKEN;
351         tkn.isReady = true;
352       } else if (eol) {
353         // empty token return TT_EORECORD("")
354         //noop: tkn.content.append("");
355         tkn.type = TT_EORECORD;
356         tkn.isReady = true;
357       } else if (c == strategy.getEncapsulator()) {
358         // consume encapsulated token
359         encapsulatedTokenLexer(tkn, c);
360       } else if (isEndOfFile(c)) {
361         // end of file return TT_EOF()
362         //noop: tkn.content.append("");
363         tkn.type = TT_EOF;
364         tkn.isReady = true;
365       } else {
366         // next token must be a simple token
367         // add removed blanks when not ignoring whitespace chars...
368         if (!strategy.getIgnoreLeadingWhitespaces()) {
369           tkn.content.append(wsBuf);
370         }
371         simpleTokenLexer(tkn, c);
372       }
373     }
374     return tkn;  
375   }
376   
377   /**
378    * A simple token lexer
379    * 
380    * Simple token are tokens which are not surrounded by encapsulators.
381    * A simple token might contain escaped delimiters (as \, or \;). The
382    * token is finished when one of the following conditions become true:
383    * <ul>
384    *   <li>end of line has been reached (TT_EORECORD)</li>
385    *   <li>end of stream has been reached (TT_EOF)</li>
386    *   <li>an unescaped delimiter has been reached (TT_TOKEN)</li>
387    * </ul>
388    *  
389    * @param tkn  the current token
390    * @param c    the current character
391    * @return the filled token
392    * 
393    * @throws IOException on stream access error
394    */
395   private Token simpleTokenLexer(Token tkn, int c) throws IOException {
396     for (;;) {
397       if (isEndOfLine(c)) {
398         // end of record
399         tkn.type = TT_EORECORD;
400         tkn.isReady = true;
401         break;
402       } else if (isEndOfFile(c)) {
403         // end of file
404         tkn.type = TT_EOF;
405         tkn.isReady = true;
406         break;
407       } else if (c == strategy.getDelimiter()) {
408         // end of token
409         tkn.type = TT_TOKEN;
410         tkn.isReady = true;
411         break;
412       } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
413         // interpret unicode escaped chars (like \u0070 -> p)
414         tkn.content.append((char) unicodeEscapeLexer(c));
415       } else if (c == strategy.getEscape()) {
416         tkn.content.append((char)readEscape(c));
417       } else {
418         tkn.content.append((char) c);
419       }
420       
421       c = in.read();
422     }
423 
424     if (strategy.getIgnoreTrailingWhitespaces()) {
425       tkn.content.trimTrailingWhitespace();
426     }
427 
428     return tkn;
429   }
430   
431   
432   /**
433    * An encapsulated token lexer
434    * 
435    * Encapsulated tokens are surrounded by the given encapsulating-string.
436    * The encapsulator itself might be included in the token using a
437    * doubling syntax (as "", '') or using escaping (as in \", \').
438    * Whitespaces before and after an encapsulated token are ignored.
439    * 
440    * @param tkn    the current token
441    * @param c      the current character
442    * @return a valid token object
443    * @throws IOException on invalid state
444    */
445   private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
446     // save current line
447     int startLineNumber = getLineNumber();
448     // ignore the given delimiter
449     // assert c == delimiter;
450     for (;;) {
451       c = in.read();
452 
453       if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
454         tkn.content.append((char) unicodeEscapeLexer(c));
455       } else if (c == strategy.getEscape()) {
456         tkn.content.append((char)readEscape(c));
457       } else if (c == strategy.getEncapsulator()) {
458         if (in.lookAhead() == strategy.getEncapsulator()) {
459           // double or escaped encapsulator -> add single encapsulator to token
460           c = in.read();
461           tkn.content.append((char) c);
462         } else {
463           // token finish mark (encapsulator) reached: ignore whitespace till delimiter
464           for (;;) {
465             c = in.read();
466             if (c == strategy.getDelimiter()) {
467               tkn.type = TT_TOKEN;
468               tkn.isReady = true;
469               return tkn;
470             } else if (isEndOfFile(c)) {
471               tkn.type = TT_EOF;
472               tkn.isReady = true;
473               return tkn;
474             } else if (isEndOfLine(c)) {
475               // ok eo token reached
476               tkn.type = TT_EORECORD;
477               tkn.isReady = true;
478               return tkn;
479             } else if (!isWhitespace(c)) {
480               // error invalid char between token and next delimiter
481               throw new IOException(
482                       "(line " + getLineNumber()
483                               + ") invalid char between encapsulated token end delimiter"
484               );
485             }
486           }
487         }
488       } else if (isEndOfFile(c)) {
489         // error condition (end of file before end of token)
490         throw new IOException(
491                 "(startline " + startLineNumber + ")"
492                         + "eof reached before encapsulated token finished"
493         );
494       } else {
495         // consume character
496         tkn.content.append((char) c);
497       }
498     }
499   }
500   
501   
502   /**
503    * Decodes Unicode escapes.
504    * 
505    * Interpretation of "\\uXXXX" escape sequences
506    * where XXXX is a hex-number.
507    * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
508    * @return the decoded character
509    * @throws IOException on wrong unicode escape sequence or read error
510    */
511   protected int unicodeEscapeLexer(int c) throws IOException {
512     int ret = 0;
513     // ignore 'u' (assume c==\ now) and read 4 hex digits
514     c = in.read();
515     code.clear();
516     try {
517       for (int i = 0; i < 4; i++) {
518         c  = in.read();
519         if (isEndOfFile(c) || isEndOfLine(c)) {
520           throw new NumberFormatException("number too short");
521         }
522         code.append((char) c);
523       }
524       ret = Integer.parseInt(code.toString(), 16);
525     } catch (NumberFormatException e) {
526       throw new IOException(
527         "(line " + getLineNumber() + ") Wrong unicode escape sequence found '" 
528         + code.toString() + "'" + e.toString());
529     }
530     return ret;
531   }
532 
533   private int readEscape(int c) throws IOException {
534     // assume c is the escape char (normally a backslash)
535     c = in.read();
536     int out;
537     switch (c) {
538       case 'r': out='\r'; break;
539       case 'n': out='\n'; break;
540       case 't': out='\t'; break;
541       case 'b': out='\b'; break;
542       case 'f': out='\f'; break;
543       default : out=c;
544     }
545     return out;
546   }
547   
548   // ======================================================
549   //  strategies
550   // ======================================================
551   
552   /**
553    * Obtain the specified CSV Strategy.  This should not be modified.
554    * 
555    * @return strategy currently being used
556    */
557   public CSVStrategy getStrategy() {
558     return this.strategy;
559   }
560   
561   // ======================================================
562   //  Character class checker
563   // ======================================================
564   
565   /**
566    * @return true if the given char is a whitespace character
567    */
568   private boolean isWhitespace(int c) {
569     return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());
570   }
571   
572   /**
573    * Greedy - accepts \n and \r\n 
574    * This checker consumes silently the second control-character...
575    * 
576    * @return true if the given character is a line-terminator
577    */
578   private boolean isEndOfLine(int c) throws IOException {
579     // check if we have \r\n...
580     if (c == '\r') {
581       if (in.lookAhead() == '\n') {
582         // note: does not change c outside of this method !!
583         c = in.read();
584       }
585     }
586     return (c == '\n');
587   }
588   
589   /**
590    * @return true if the given character indicates end of file
591    */
592   private boolean isEndOfFile(int c) {
593     return c == ExtendedBufferedReader.END_OF_STREAM;
594   }
595 }